Geographic Mapping

Loading in the coordinates

## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   X1 = col_double()
## )
## See spec(...) for full column specifications.

Map Plotting

*This will show us a map of the whole world, with each country coloured by the (log-scaled) number of papers it has produced.*

# World choropleth: each country polygon is filled by the log-transformed
# paper count. The extra `text` aesthetic is unknown to ggplot2 (hence the
# warning below) but is picked up by ggplotly for interactive tooltips.
pl <- ggplot() + 
  geom_polygon(data = world_data2, aes(x = long, y = lat, group = group, fill = log(Count),text=Count)) +
  #geom_point(data=data,aes(x = long, y = lat),alpha=0.5,size=0.5,colour="grey")+
  coord_fixed(1.3)+ # fixed aspect ratio so the map is not distorted
  scale_fill_viridis()+ # colour-blind-friendly continuous palette
  theme_void() # drop axes/grid; only the map remains
## Warning: Ignoring unknown aesthetics: text
# Interactive plotly widget; hover tooltip shows the raw Count value
ggplotly(pl,tooltip = "text")
## R Markdown
###get only europe

# Bounding box roughly covering Europe (lon -9..42.5, lat 36..70.1),
# declared in WGS84 (EPSG:4326) and converted to an sfc polygon so it can
# be used in the spatial predicates below.
filt_bbox <- sf::st_bbox(c(xmin = -9, 
                           ymin = 36, 
                           xmax = 42.5, 
                           ymax = 70.1), 
                         crs = st_crs(4326)) %>% 
  sf::st_as_sfc(.)


# For each country polygon, test whether it lies entirely inside the bbox.
# NOTE(review): st_within treats lon/lat as planar here (see warning) —
# acceptable for a coarse continent filter, but verify for precise work.
find_data <- sf::st_within(world_sf, filt_bbox)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#> although coordinates are longitude/latitude, st_within assumes that they are planar
# Keep only countries whose st_within result is non-empty (inside the bbox)
europe_sf <- world_sf[which(lengths(find_data) != 0), ]

# Dense logical matrix: rows = institution points, cols = European polygons
europe_result <- st_within(point_sf, europe_sf, sparse = FALSE)
## although coordinates are longitude/latitude, st_within assumes that they are planar
# Calculate the total count of each polygon
# Store the result as a new column "Count" in world_sf
europe_sf <- europe_sf %>%
  mutate(Count = apply(europe_result, 2, sum)) # column sums = points per country

# Convert world_sf to a data frame world_df 
europe_df <- europe_sf
st_geometry(europe_df) <- NULL # drop geometry, leaving a plain data frame

# Get world data frame
world_data <- map_data("world")

# Merge world_data and world_df
europe_data <- europe_df %>%
  left_join(world_data, by = c("region"))

# Which institution points intersect any European country polygon?
ind <- sf::st_intersects(point_sf, europe_sf)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
points_europe<-  point_sf[which(lengths(ind) != 0), ]
# Append X/Y coordinate columns extracted from the sf point geometries
points_europe <- cbind(points_europe,st_coordinates(points_europe))
# NOTE(review): magic column indices — confirm which columns 6 and 7 are
# in the point_sf schema before reusing this elsewhere
points_europe=points_europe[,-c(6,7)]

*Now we are plotting the number of papers that come out of each country in Europe, and also adding in the locations of the institutions.*

# Load the precomputed Europe choropleth data and institution points
europe_data=read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/europe_data.csv")
points_europe=read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/europe_points.csv")
# "region;Count" labels — built here, though the plot below uses the
# affiliation text aesthetic for its tooltip instead
text=paste(europe_data$region,europe_data$Count, sep=";")

# Europe choropleth (fill = log paper count) with institution locations
# overlaid as grey points. The `text` aesthetic is unknown to ggplot2
# (warning below) but feeds the plotly tooltip, wrapped at 50 characters.
pl <- ggplot() + 
  geom_polygon(data = europe_data, aes(x = long, y = lat, group = group, fill = log(Count))) +
  geom_point(data=points_europe,aes(x=X,y=Y,text=str_wrap(affiliation,50)),alpha=0.5,size=0.5,colour="grey")+
  coord_fixed(1.3)+
  scale_fill_viridis()+
  theme_void()
## Warning: Ignoring unknown aesthetics: text
ggplotly(pl,tooltip="text")

Bigrams

Let’s load our packages

library(tidyverse)
library(tidytext)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(widyr)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following object is masked from 'package:rgeos':
## 
##     union
## The following object is masked from 'package:plotly':
## 
##     groups
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggplot2)
library(ggraph)
## 
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
## 
##     geometry
library(readr)
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:stats':
## 
##     filter

Load in the CSV, see what topic options we have available

# Read the preprocessed bigram data and list the distinct topics available
df=read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/bigrams.csv")
print(kable(unique(df$topic))) # kable renders the topics as a markdown table
## 
## 
## |x                     |
## |:---------------------|
## |Assembly              |
## |Databases             |
## |Epigenetics           |
## |Gene Expression       |
## |Genome Annotation     |
## |Phylogenetics         |
## |Sequence Alignment    |
## |Sequencing            |
## |Structural Prediction |
## |Variant Calling       |

*This takes the bigram frequency determination and plotting and wraps it in one function, visualize_bigrams(). The plotting starts on line 228.*

visualize_bigrams <- function(df_name, textfield, topic_title){
  # Build and plot a bigram co-occurrence network for one topic.
  #
  # Args:
  #   df_name:     data frame of papers (one row per paper).
  #   textfield:   unquoted column containing the text to tokenize
  #                (tidy-eval, embraced with {{ }}).
  #   topic_title: character string used as the plot title.
  #
  # Returns: a ggraph/ggplot object — nodes are words sized/coloured by
  # unigram frequency, edges weighted by bigram frequency.

  # Strip non-alphanumeric characters and remove English stop words
  # before tokenizing
  df_cleaned <- df_name %>%
    mutate(textfield_clean = removeWords(gsub("[^A-Za-z0-9 ]", "", {{textfield}}), stop_words$word))

  # Tokenize into bigrams and tabulate their frequencies
  df_bigrams <- df_cleaned %>%
    unnest_tokens(bigrams, textfield_clean, token = "ngrams", n = 2)

  df_freq <- as.data.frame(table(df_bigrams$bigrams)) %>%
    arrange(desc(Freq))

  # Keep the 100 most frequent bigrams (ties included, matching the old
  # top_n() behaviour) and split "word1 word2" into its component words.
  # slice_max() replaces the superseded top_n().
  df_top_bigrams <- df_freq %>%
    slice_max(Freq, n = 100) %>%
    separate(Var1, c("word1", "word2"))

  top_bigram_words <- unique(c(df_top_bigrams$word1, df_top_bigrams$word2))

  # Unigram frequencies, restricted to words appearing in a top bigram;
  # these drive the node sizes and colours
  word_list <- df_cleaned %>%
    unnest_tokens(words, textfield_clean, token = "ngrams", n = 1)

  df_word_list <- as.data.frame(table(word_list$words)) %>%
    arrange(desc(Freq)) %>%
    filter(Var1 %in% top_bigram_words)

  names(df_word_list)[2] <- "Term_Frequency"
  names(df_top_bigrams)[3] <- "Edge_Frequency"

  # Left-assignment instead of the original right-assignment (->)
  graph_hold <- graph_from_data_frame(d = df_top_bigrams, vertices = df_word_list)

  # Note: the original also added scale_fill_viridis_c(), a no-op since no
  # fill aesthetic is mapped; it has been dropped.
  pl <- graph_hold %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = Edge_Frequency), show.legend = TRUE) +
    geom_node_point(aes(color = Term_Frequency, size = Term_Frequency), alpha = 0.7) +
    geom_node_text(aes(label = name), repel = TRUE) +
    scale_color_viridis_c(direction = -1) +
    theme_void() +
    guides(size = "none") + # guides(size = FALSE) is deprecated (ggplot2 >= 3.3.4)
    labs(title = quo_name(topic_title)) +
    theme(plot.title = element_text(size = 26, face = "bold"))
  #ggsave(pl,filename = paste0("../figures/", "bigrams_", str_to_lower(str_replace(topic_title, "\\s", "_")), ".png")
         #,width = 12
         #,height = 8)
  pl
}

Now that we have the function made, decide on a topic and make a bigram diagram for it.

# Subset the papers to the Assembly topic and draw its bigram network
# (empty string suppresses the plot title)
df_assembly <- filter(df, topic == "Assembly")
visualize_bigrams(df_assembly, abstract, "")
## Warning: Factor `journal` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `affiliation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `abstract` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `journal` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `affiliation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `abstract` contains implicit NA, consider using
## `forcats::fct_explicit_na`